package com.ly.blog_data.parse;

import com.ly.blog_common.response.BlogException;
import com.ly.blog_common.response.BlogResponseStatus;
import com.ly.blog_data.entity.CatalogueEntity;
import com.ly.blog_data.entity.CatalogueRuleEntity;
import com.ly.blog_data.entity.Constant;
import lombok.extern.slf4j.Slf4j;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts catalogue (table-of-contents) entries from the HTML of a catalogue page.
 *
 * <p>Parsing happens in two regex passes, both built from the supplied {@link CatalogueRuleEntity}:
 * <ol>
 *   <li>an "area" regex isolates the part of the page that holds the catalogue;</li>
 *   <li>a "content" regex is then applied repeatedly to that area, one match per entry, where
 *       capture group 1 is the entry link and capture group 2 is the entry name.</li>
 * </ol>
 * Links that are not absolute are resolved against {@code domainUrl}.
 *
 * @author ly create at 2021/6/22 - 17:18
 **/
@Slf4j
public class HtmlCatalogueExplain {

    /** Only this many leading entries are echoed into the report built by {@link #parserHtmlCheck}. */
    private static final int CHECK_REPORT_LIMIT = 3;

    /**
     * URL of the catalogue page with the trailing html file name removed (the longest remaining
     * path). Links are resolved against it in two ways: root-relative links ("/...") replace its
     * path, other relative links are appended to it.
     */
    private final String domainUrl;

    /** Rule fragments from which the area regex and the content regex are assembled. */
    private final CatalogueRuleEntity catalogueRuleEntity;

    /**
     * @param catalogueRuleEntity rule fragments used to build the area and content regexes
     * @param domainUrl           base URL that non-absolute entry links are resolved against
     */
    HtmlCatalogueExplain(CatalogueRuleEntity catalogueRuleEntity, String domainUrl){
        this.domainUrl = domainUrl;
        this.catalogueRuleEntity = catalogueRuleEntity;
    }

    /**
     * Builds the regex that locates the catalogue area: every "pre" fragment, then every "post"
     * fragment, each neighbouring pair separated by {@code Constant.REG_ALL_}.
     *
     * @return the area regex source
     */
    private String areaRule2Reg(){
        String preRegex = joinRuleParts(this.catalogueRuleEntity.getAreaPreList());
        String postRegex = joinRuleParts(this.catalogueRuleEntity.getAreaPostList());
        return preRegex + Constant.REG_ALL_ + postRegex;
    }

    /**
     * Concatenates rule fragments with {@code Constant.REG_ALL_} between them. No separator is
     * emitted while the accumulated text is still empty, so leading empty fragments are dropped.
     *
     * @param parts rule fragments in match order
     * @return the joined regex source, or "" for an empty list
     */
    private String joinRuleParts(List<String> parts) {
        String joined = "";
        for (String part : parts) {
            joined = "".equals(joined) ? part : joined + Constant.REG_ALL_ + part;
        }
        return joined;
    }

    /**
     * Builds the regex that matches a single catalogue entry:
     * {@code dataPre REG_MIX_ dataCenter REG_MIX_ dataPost}.
     *
     * @return the content regex source
     */
    private String contentRule2Reg(){
        return this.catalogueRuleEntity.getDataPre()
                + Constant.REG_MIX_
                + this.catalogueRuleEntity.getDataCenter()
                + Constant.REG_MIX_
                + this.catalogueRuleEntity.getDataPost();
    }

    /**
     * Parses the catalogue page and returns every entry found.
     *
     * @param catalogueUrlHtmlContent full HTML of the catalogue page
     * @return the parsed entries, never empty
     * @throws BlogException with {@code CATALOGUE_AREA_EMPTY} when the area regex does not match,
     *                       or {@code CATALOGUE_CONTENT_EMPTY} when the area holds no entry
     */
    public List<CatalogueEntity> parserHtml(String catalogueUrlHtmlContent) {
        log.info("【获取目录】 --- 开始 -- ");
        Matcher areaMatcher = Pattern.compile(areaRule2Reg()).matcher(catalogueUrlHtmlContent);
        if (!areaMatcher.find()) {
            throw new BlogException(BlogResponseStatus.CATALOGUE_AREA_EMPTY);
        }

        // Group 0 is the whole match; the group whose number equals the amount of "pre"
        // fragments is the text between the last "pre" and the first "post" fragment.
        // NOTE(review): this relies on Constant.REG_ALL_ being exactly one capturing group - confirm.
        String catalogueContent = areaMatcher.group(this.catalogueRuleEntity.getAreaPreList().size());
        log.info("【获取目录】 --- 目录区间获取成功 -- 内容 {}",catalogueContent);

        List<CatalogueEntity> catalogueEntities = new ArrayList<>();
        Matcher entryMatcher = Pattern.compile(contentRule2Reg()).matcher(catalogueContent);
        while (entryMatcher.find()) {
            log.info("【获取目录】 --- 获取单个目录成功 -- 内容 {}",entryMatcher.group(0));
            catalogueEntities.add(toCatalogueEntity(entryMatcher));
        }
        if (catalogueEntities.isEmpty()) {
            throw new BlogException(BlogResponseStatus.CATALOGUE_CONTENT_EMPTY);
        }
        log.info("【获取目录】 --- 结束 -- 获取目录长度:{}",catalogueEntities.size());

        return catalogueEntities;
    }

    /**
     * Dry-run variant of {@link #parserHtml(String)} used to validate a rule: instead of throwing
     * it records each step in {@code checkResult} and signals failure by returning {@code null}.
     *
     * @param catalogueUrlHtmlContent full HTML of the catalogue page
     * @param checkResult             receives a human-readable trace of the check
     * @return the parsed entries, or {@code null} when the area regex does not match or the area
     *         holds no entry
     */
    public List<CatalogueEntity> parserHtmlCheck(String catalogueUrlHtmlContent, MsgList<String> checkResult) {
        checkResult.add("检测目录规则 - 开始 -----------\n");
        log.info("【检测目录】 --- 开始 -- ");
        Matcher areaMatcher = Pattern.compile(areaRule2Reg()).matcher(catalogueUrlHtmlContent);
        if (!areaMatcher.find()) {
            checkResult.addd("匹配目录区间规则 匹配失败 --------- 内容为：").addd(catalogueUrlHtmlContent).addd("\n");
            return null;
        }

        checkResult.add("匹配目录区间规则 匹配成功 ---------\n");
        // Same group selection as parserHtml: the group numbered like the "pre" fragment count.
        String catalogueContent = areaMatcher.group(this.catalogueRuleEntity.getAreaPreList().size());
        log.info("【检测目录】 --- 目录区间获取成功 -- 内容 {}",catalogueContent);
        checkResult.addd("匹配目录区间规则 匹配内容为 ---------").addd(catalogueContent).addd("\n");

        List<CatalogueEntity> catalogueEntities = new ArrayList<>();
        Matcher entryMatcher = Pattern.compile(contentRule2Reg()).matcher(catalogueContent);
        while (entryMatcher.find()) {
            log.info("【检测目录】 --- 获取单个目录成功 -- 内容 {}",entryMatcher.group(0));
            CatalogueEntity catalogueEntity = toCatalogueEntity(entryMatcher);
            // Keep the report short: only the first few entries are echoed back.
            if (catalogueEntities.size() < CHECK_REPORT_LIMIT) {
                checkResult.addd("匹配目录内容规则 匹配成功 ---------\n");
                checkResult.addd("目录内容 - catalogueEntity:").addd(catalogueEntity.toString());
            }
            catalogueEntities.add(catalogueEntity);
        }
        if (catalogueEntities.isEmpty()) {
            checkResult.addd("匹配目录内容规则 匹配失败 ---------\n");
            return null;
        }
        log.info("【检测目录】 --- 结束 -- 检测目录长度:{}",catalogueEntities.size());
        checkResult.add("检测目录规则 - 结束 -----------\n");
        return catalogueEntities;
    }

    /**
     * Converts the current content-regex match into an entity: group 1 becomes the (resolved)
     * URL and group 2 the catalogue name.
     *
     * @param entryMatcher matcher positioned on a successful content-regex match
     * @return the populated entity
     */
    private CatalogueEntity toCatalogueEntity(Matcher entryMatcher) {
        CatalogueEntity catalogueEntity = new CatalogueEntity();
        catalogueEntity.setUrl(checkUrlAndAdd(entryMatcher.group(1)));
        catalogueEntity.setCatalogueName(entryMatcher.group(2));
        return catalogueEntity;
    }

    /**
     * Turns an entry link into a full URL.
     * <ul>
     *   <li>{@code http://...} / {@code https://...} links are returned unchanged;</li>
     *   <li>protocol-relative links ({@code //host/...}) get the scheme of {@code domainUrl}
     *       ("http" when {@code domainUrl} carries no scheme);</li>
     *   <li>root-relative links ({@code /...}) are appended to the scheme-and-host part of
     *       {@code domainUrl};</li>
     *   <li>anything else is appended to {@code domainUrl} with exactly one separating slash.</li>
     * </ul>
     *
     * @param url link text captured from the page
     * @return the resolved URL
     */
    private String checkUrlAndAdd(String url) {
        // Only a leading scheme makes a link absolute; "http" elsewhere (e.g. in a query
        // string or a file name) must not.
        if (hasHttpScheme(url)) {
            return url;
        }

        int schemeEnd = domainUrl.indexOf("://");
        if (url.startsWith("//")) {
            String scheme = schemeEnd > 0 ? domainUrl.substring(0, schemeEnd) : "http";
            return scheme + ":" + url;
        }
        if (url.startsWith("/")) {
            // The first '/' after the scheme separator ends the host part; when there is none
            // the base URL has no path and is used whole.
            int hostStart = schemeEnd < 0 ? 0 : schemeEnd + 3;
            int pathStart = domainUrl.indexOf('/', hostStart);
            String origin = pathStart < 0 ? domainUrl : domainUrl.substring(0, pathStart);
            return origin + url;
        }
        return domainUrl.endsWith("/") ? domainUrl + url : domainUrl + "/" + url;
    }

    /** Returns whether {@code url} starts with "http://" or "https://", ignoring case. */
    private static boolean hasHttpScheme(String url) {
        return url.regionMatches(true, 0, "http://", 0, 7)
                || url.regionMatches(true, 0, "https://", 0, 8);
    }
}
